# Purpose: Bring in and analyze forecasting data collected from 280 subjects
# Updated and Modified : Mamoor Ali Khan
# Date Modified: Feb 26, 2025

rm(list = ls())

# Install and load pacman if not already done
if (!requireNamespace("pacman", quietly = TRUE)) {
   install.packages("pacman")
}

# Unload all user-loaded packages
# pacman::p_unload(all)
pacman::p_load(
   tidyverse, lubridate, readxl, openxlsx, estimatr, texreg, gridExtra, RColorBrewer,
   ggpubr, beeswarm, gt, modelsummary, nnet, here, extrafont, fastDummies
)




# =====Load and clean data from multiple surveys====
clean_excel <- function(data, wave_name) {
   data %>%
      rename(
         position = `1`,
         optimism = `2`,
         familiarity = `3`,
         ans_phone = `4`,
         ans_quest = `5`,
         call_eval_mpa = `6.1`,
         call_eval_gov = `6.2`,
         call_account = `6.3`,
         call_eval_mpa_rev = `6.1r`,
         call_eval_gov_rev = `6.2r`,
         call_account_rev = `6.3r`,
         responsive_eval_mpa = `7.1`,
         responsive_eval_gov = `7.2`,
         responsive_account = `7.3`,
         responsive_eval_mpa_rev = `7.1r`,
         responsive_eval_gov_rev = `7.2r`,
         responsive_account_rev = `7.3r`
      ) %>%
      mutate(
         id = 1:n(),
         wave = wave_name
      )
}

# Load NYU data
input <- read_excel("../1_data/4_forecasting/20190209_nyu.xlsx", na = c(".", "dk")) %>%
   clean_excel(., "NYU") %>%
   mutate(
      # Fix one typo
      call_eval_mpa = ifelse(call_eval_mpa == 0.8, 0.08, call_eval_mpa),
      position = recode(
         position,
         `f` = "faculty",
         `g` = "gradstudent",
         `pd` = "postdoc",
         .missing = "researchstaffother"
      )
   ) %>%
   mutate_at(
      vars(starts_with("ans_")),
      ~ {
         . / 100
      }
   )

filled_evals <- input %>%
   select(id, starts_with("call_"), starts_with("responsive_")) %>%
   gather(key, val, -id) %>%
   mutate(
      revision = grepl("rev$", key),
      key = gsub("_rev", "", key)
   ) %>%
   spread(key, val) %>%
   group_by(id) %>%
   mutate_at(
      vars(starts_with("call_"), starts_with("responsive_")),
      list(~ case_when(
         revision & is.na(.) & (length(setdiff(., NA)) == 1) ~ .[!revision],
         TRUE ~ .
      ))
   ) %>%
   gather(key, val, -id, -revision) %>%
   mutate(key = paste0(key, ifelse(revision, "_rev", ""))) %>%
   select(-revision) %>%
   spread(key, val)

input <- left_join(
   input %>% select(-starts_with("call_"), -starts_with("responsive_")),
   filled_evals
)

clean_qualtrics <- function(data, wave_chr) {
   data %>%
      filter(Status != "Survey Preview", Finished, Q1 == "Yes") %>%
      rename(
         call_eval_mpa = Q16_1,
         call_eval_gov = Q16_2,
         call_account = Q16_3,
         responsive_eval_mpa = Q15_1,
         responsive_eval_gov = Q15_2,
         responsive_account = Q15_3,
         position = Q2,
         optimism = Q8,
         familiarity = Q9,
         call_eval_mpa_rev = Q17_1,
         call_eval_gov_rev = Q27_1,
         call_account_rev = Q28_1,
         responsive_eval_mpa_rev = Q29_1,
         responsive_eval_gov_rev = Q31_1,
         responsive_account_rev = Q32_1
      ) %>%
      mutate(
         ans_phone = ifelse(
            !is.na(Q12_1),
            Q12_1 / 100,
            Q22_1 / 100
         ),
         ans_quest = ifelse(
            !is.na(Q12_2),
            Q12_2 / 100,
            Q22_2 / 100
         ),
         familiarity = recode(
            familiarity,
            `Unfamiliar (2/4)` = 2,
            `Familiar (3/4)` = 3,
            `Very familiar (4/4)` = 4,
            `Very unfamiliar (1/4)` = 1,
            `Don’t know` = NA_real_
         ),
         optimism = case_when(
            grepl("1", optimism) ~ 1,
            grepl("2", optimism) ~ 2,
            grepl("3", optimism) ~ 3,
            grepl("4", optimism) ~ 4,
            grepl("5", optimism) ~ 5,
            TRUE ~ NA_real_
         ),
         wave = wave_chr,
         pilot_first = as.integer(grepl("PILOT", Group)),
         compliance_first = as.integer(grepl("COMPLIANCE", Group)),
         position = recode(
            position,
            `Graduate Student` = "gradstudent",
            `Graduate student` = "gradstudent",
            `Post-Doctoral Fellow` = "postdoc",
            `Faculty` = "faculty",
            `Research Manager` = "researchstaffother",
            `Research Assistant` = "researchstaffother",
            `Other Research Staff` = "researchstaffother",
            `Administrative Staff` = "researchstaffother",
            `Other` = "researchstaffother"
         )
      ) %>%
      select(wave, starts_with("call"), starts_with("responsive"), familiarity, optimism, pilot_first, compliance_first, position, starts_with("ans_"))
}

# Load UCSD data
qdat <- read_csv("../1_data/4_forecasting/FORECASTING+THE+EFFECTS+OF+IVR+COMMUNICATION+BETWEEN+POLITICIANS+AND+CITIZENS_March+27,+2019_09.27.csv") %>%
   clean_qualtrics(., "UCSD")

# Load CERP
cerp_file <- "../1_data/4_forecasting/CERP+-+FORECASTING+THE+EFFECTS+OF+IVR+COMMUNICATION+BETWEEN+POLITICIANS+AND+CITIZENS_May+20,+2019_15.00_ed.csv"
cerphead <- as.character(read_csv(cerp_file, n_max = 1, col_names = FALSE)[1, ])
cerpdat <- read_csv(
   cerp_file,
   skip = 3,
   col_names = cerphead
) %>%
   clean_qualtrics(., "CERP")

# US (grad students before May 22)
usdat <- read_rds("../1_data/4_forecasting/usdat.rds")

# Grad students (UCLA and Stanford)
graddat <- usdat %>%
   filter(recorded_date < dmy("22-05-19")) %>%
   clean_qualtrics(., "GradStudents")

# CP list
cpdat <- usdat %>%
   filter(recorded_date >= dmy("22-05-19")) %>%
   clean_qualtrics(., "CPList")

# IMS students
imdat <- read_xlsx("../1_data/4_forecasting/20190524_peshawar1 .xlsx", na = c(".", "DK")) %>%
   bind_rows(read_xlsx("../1_data/4_forecasting/20190524_peshawar2.xlsx", na = c(".", "DK"))) %>%
   bind_rows(read_xlsx("../1_data/4_forecasting/20190524_peshawar3.xlsx", na = c(".", "DK"))) %>%
   clean_excel(., "IMS") %>%
   mutate(
      position = recode(
         `Teacher/Student`,
         `TEACHER` = "faculty",
         `STUDENT` = "undergraduate"
      )
   )

# Merge data together
all <- bind_rows(input, qdat, cerpdat, graddat, cpdat, imdat) %>%
   mutate(
      # Bin variables
      optimism_group = recode(
         optimism,
         `1` = "Pessimistic",
         `2` = "Pessimistic",
         `3` = "Neutral",
         `4` = "Optimistic",
         `5` = "Optimistic"
      ),
      familiar = as.numeric(familiarity > 2.5),
      # Accuracy of forecasts
      abs_error =
         abs(call_eval_mpa - 0.015) +
            abs(call_eval_gov - 0.022) +
            abs(call_account - 0.001) +
            abs(responsive_eval_mpa + 0.02) +
            abs(responsive_eval_gov - 0.018) +
            abs(responsive_account + 0.007),
      # ID for long, wide reshapes
      unique_id = 1:n()
   ) %>%
   mutate_at(
      vars(starts_with("ans_")),
      ~ {
         ifelse(. > 1.000001, . / 100, .)
      }
   )
all$call_avg <- rowMeans(all[, c("call_eval_mpa", "call_eval_gov", "call_account")])
all$resp_avg <- rowMeans(all[, c("responsive_account", "responsive_eval_gov", "responsive_eval_mpa")])

# Sanity checking some variables
table(is.na(all$responsive_eval_mpa), all$wave, useNA = "always")
table(is.na(all$responsive_eval_gov), all$wave, useNA = "always")
table(is.na(all$responsive_account), all$wave, useNA = "always")
table(is.na(all$call_eval_mpa_rev), all$wave, useNA = "always")

# Long version of the data
all_long <- all %>%
   gather(
      key, val,
      call_eval_mpa, call_eval_gov, call_account, call_avg,
      responsive_eval_mpa, responsive_eval_gov, responsive_account,
      ans_phone, ans_quest
   ) %>%
   # Two survey experiments in the qualtrics data only
   mutate(
      Treatment = case_when(
         grepl("call_", key) ~ "Any Call\nvs. Control",
         grepl("responsive_", key) ~ "Responsive\nvs.Generic",
         grepl("ans_", key) ~ "Compliance"
      ),
      Outcome = case_when(
         grepl("mpa", key) ~ "Evaluations of the MPA",
         grepl("gov", key) ~ "Evaluations of government",
         grepl("account", key) ~ "Prospects for accountability",
         key == "call_avg" ~ "Index Average",
         key == "ans_phone" ~ "Answer the phone",
         key == "ans_quest" ~ "Answer question given\nthey answer the phone"
      ),
      Outcome = factor(Outcome, levels = c(
         "Evaluations of the MPA",
         "Evaluations of government",
         "Prospects for accountability",
         "Index Average",
         "Answer the phone",
         "Answer question given\nthey answer the phone"
      )),
      val = ifelse(grepl("ans_", key), val * 100, val)
   )



# ==== Forecast error by group =====


# ==== Basic descriptive results ====


# correlation between answers
cor(all[, c("call_eval_mpa", "call_eval_gov", "call_account")], use = "pairwise")

# relationship with familiarity, waves, etc.
lm_robust(call_eval_mpa ~ wave + 0, all)
lm_robust(call_eval_gov ~ wave + 0, all)
lm_robust(call_account ~ wave + 0, all)
lm_robust(call_avg ~ wave + 0, all)
lm_robust(call_avg ~ position + 0, all)
lm_robust(call_avg ~ factor(familiarity) + 0, all)
lm_robust(call_avg ~ factor(familiarity), all)
lm_robust(call_avg ~ position, all)

lm_robust(responsive_eval_mpa ~ wave + 0, all)
lm_robust(responsive_eval_gov ~ wave + 0, all)
lm_robust(responsive_account ~ wave + 0, all)


lm_robust(call_eval_mpa ~ familiarity, all)
lm_robust(call_eval_mpa ~ optimism_group, all)
lm_robust(call_eval_mpa ~ familiarity * optimism, all)
lm_robust(call_avg ~ familiarity * optimism, all)

# Simple averages and cross-tabs, averages by group
all %>%
   summarize_at(
      vars(optimism, familiarity, starts_with("call"), starts_with("responsive"), -ends_with("rev")),
      list(~ mean(., na.rm = TRUE))
   ) %>%
   as.data.frame() %>%
   t() %>%
   round(., 3)

all %>%
   group_by(optimism) %>%
   summarize_at(
      vars(familiarity, starts_with("ans"), starts_with("call"), starts_with("responsive")),
      list(avg = ~ round(mean(., na.rm = TRUE), 3))
   ) %>%
   as.data.frame() %>%
   t()

# ===== Forecast hists =====

all_long_2 <- all_long %>%
   filter(!grepl("Responsive", Treatment), !grepl("Index", Outcome))
avgs <- all_long_2 %>%
   group_by(Outcome, Treatment) %>%
   summarize(avg = mean(val, na.rm = TRUE))

truth <- avgs %>%
   select(Treatment, Outcome) %>%
   mutate(
      truth = case_when(
         grepl("Call", Treatment) & grepl("MPA", Outcome) ~ 0.015,
         grepl("Call", Treatment) & grepl("gover", Outcome) ~ 0.022,
         grepl("Call", Treatment) & grepl("account", Outcome) ~ 0.001,
         grepl("Answer the phone", Outcome) ~ 73.1,
         grepl("Answer q", Outcome) ~ 23.8
      )
   )

pvals <- all_long_2 %>%
   left_join(truth) %>%
   filter(!grepl("Average", Outcome)) %>%
   group_by(Outcome, Treatment) %>%
   summarize(pvalue = t.test(val, mu = truth[1])[["p.value"]])

text_size <- 2.5
compliance_forecast_bars <- ggplot(all_long_2 %>% filter(grepl("Answer", Outcome)), aes(x = val)) +
   geom_histogram(binwidth = 5) +
   facet_wrap(Outcome ~ ., strip.position = "top", ncol = 1) +
   geom_vline(data = avgs %>% filter(grepl("Answer", Outcome)), aes(xintercept = avg), color = "grey30", linetype = 2) +
   geom_text(data = avgs %>% filter(grepl("Answer", Outcome)), aes(x = avg + ifelse(avg < 53, -14, 14), y = 58, label = paste0("Avg. forecast %\n", round(avg, 1))), color = "grey30", size = 3) +
   ylim(0, 70) +
   ggtitle("Panel A: Forecasts of Compliance") +
   ylab("N of Forecasters") +
   xlab("Estimated Compliance %") +
   theme_minimal(base_size = 10) +
   theme(strip.text = element_text(size = 7), axis.text = element_text(color = "black"))

index_forecast_bars <- ggplot(
   data = all_long_2 %>% filter(!grepl("Answer", Outcome)),
   aes(x = val)
) +
   geom_vline(xintercept = 0, linetype = 2, size = 0.4, color = "grey80") +
   geom_histogram() +
   facet_wrap(Outcome ~ ., strip.position = "top", ncol = 1) +
   geom_vline(data = avgs %>% filter(!grepl("Answer", Outcome)), aes(xintercept = avg), color = "grey30", linetype = 2) +
   geom_text(data = avgs %>% filter(!grepl("Answer", Outcome)), aes(x = avg + 0.09, y = 85, label = paste0("Avg. forecast ITT\n", round(avg, 3))), color = "grey30", size = 3) +
   ggtitle("Panel B: Forecasts of Treatment Effects") +
   ylab("N of Forecasters") +
   xlab("Estimated Effects in SDs") +
   theme_minimal(base_size = 10) +
   theme(strip.text = element_text(size = 7), axis.text = element_text(color = ))

compliance_forecast_bars_pvals <- compliance_forecast_bars +
   geom_vline(data = truth %>% filter(grepl("Answer", Outcome)), aes(xintercept = truth), color = "red", linetype = 3) +
   geom_text(data = truth %>% filter(grepl("Answer", Outcome)), aes(x = truth + ifelse(truth < 50, -12, 12), y = 58, label = paste0("Realized %\n", truth)), size = 3, color = "red")

index_forecast_bars_pvals <- index_forecast_bars +
   geom_vline(data = truth %>% filter(!grepl("Answer", Outcome)), aes(xintercept = truth), color = "red", linetype = 3) +
   geom_text(data = truth %>% filter(!grepl("Answer", Outcome)), aes(x = truth - 0.075, y = 85, label = paste0("Realized ITT\n", truth)), size = 3, color = "red")

pdf(file = "../3_output/1_figures/fig-L1.pdf", width = 5, height = 6.5, family = "Times")
grid.arrange(compliance_forecast_bars_pvals, index_forecast_bars_pvals, nrow = 2, heights = c(2.2, 3))
dev.off()


######
# END OF SCRIPT
#####
